{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "import pandas as pd\n",
    "import polars_ds as pds\n",
    "import polars_ds.linear_models as pds_linear\n",
    "# Requires version >= v0.5.1\n",
    "print(pds.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "size = 50_000\n",
    "df = pds.random_data(size=size, n_cols=0).select(\n",
    "    pds.random(0.0, 1.0).alias(\"x1\"),\n",
    "    pds.random(0.0, 1.0).alias(\"x2\"),\n",
    "    pds.random(0.0, 1.0).alias(\"x3\"),\n",
    "    pds.random(0.0, 1.0).alias(\"x4\"),\n",
    "    pds.random(0.0, 1.0).alias(\"x5\"),\n",
    "    pds.random_int(0,4).alias(\"code\"),\n",
    "    pl.Series(name = \"id\", values = range(size))\n",
    ").with_columns(\n",
    "    y = pl.col(\"x1\") * 0.5 + pl.col(\"x2\") * 0.25 - pl.col(\"x3\") * 0.15 + pl.col(\"x4\") *0.2 - pl.col(\"x5\") * 0.13 + pds.random() * 0.0001,\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare data for Scikit-learn. We assume the Scikit-learn + NumPy combination. \n",
    "# One can simply replace to_numpy() by to_pandas() to test the Scikit-learn + Pandas combination\n",
    "from sklearn.linear_model import Lasso, Ridge, LinearRegression\n",
    "X = df.select(\"x1\", \"x2\", \"x3\", \"x4\", \"x5\").to_numpy()\n",
    "y = df.select(\"y\").to_numpy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Benchmarks \n",
    "\n",
    "I did not invent any of the algorithms that solves the linear regression problem. Not did I make any improvement to existing algorithms. I only rewrote them in Rust, using Faer, and brought the algorithms alive with Polars.\n",
    "\n",
    "1. Polars DS In-DataFrame Linear Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy LinearRegression\n",
    "2. Polars DS In-DataFrame Ridge Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy Ridge\n",
    "3. Polars DS In-DataFrame Lasso Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy Lasso"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Polars DS way\n",
    "print(\n",
    "    \"Polars DS: \",\n",
    "    df.select(\n",
    "        pds.lin_reg(\n",
    "            \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
    "            target = \"y\",\n",
    "            method = \"normal\",\n",
    "        )\n",
    "    ).item(0, 0)\n",
    ")\n",
    "\n",
    "# Fit is done implicitly because X and y are passed at initialization\n",
    "# You can also don't put X and y here and do a lr.fit(X,y) later.\n",
    "lr = pds_linear.LR(\n",
    "    X=X, y=y, add_bias=False, method=\"normal\"\n",
    ") \n",
    "print(\"PDS LR: \", lr.coeffs)\n",
    "\n",
    "# Sklearn\n",
    "reg = LinearRegression(fit_intercept=False)\n",
    "reg.fit(X, y)\n",
    "print(\"Sklearn: \", reg.coef_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit \n",
    "df.select(\n",
    "    pds.lin_reg(\n",
    "        \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
    "        target = \"y\",\n",
    "        method = \"normal\",\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "lr = pds_linear.LR(\n",
    "    add_bias=False, method=\"normal\"\n",
    ")\n",
    "lr.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "reg = LinearRegression(fit_intercept=False, copy_X=False)\n",
    "reg.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Polars DS way\n",
    "print(\n",
    "    \"Polars DS: \",\n",
    "    df.select(\n",
    "        pds.lin_reg(\n",
    "            \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
    "            target = \"y\",\n",
    "            method = \"l1\",\n",
    "            l1_reg = 0.1\n",
    "        )\n",
    "    ).item(0, 0)\n",
    ")\n",
    "\n",
    "# Fit is done implicitly because X and y are passed at initialization\n",
    "# You can also don't put X and y here and do a lr.fit(X,y) later.\n",
    "lr = pds_linear.LR(\n",
    "    X=X, y=y, add_bias=False, method=\"l1\", lambda_ = 0.1,\n",
    ") \n",
    "print(\"PDS LR: \", lr.coeffs)\n",
    "\n",
    "# Sklearn\n",
    "reg = Lasso(alpha = 0.1, fit_intercept=False)\n",
    "reg.fit(X, y)\n",
    "print(\"Sklearn: \", reg.coef_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "df.select(\n",
    "    pds.lin_reg(\n",
    "        \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
    "        target = \"y\",\n",
    "        method = \"l1\",\n",
    "        l1_reg = 0.1\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "lr = pds_linear.LR(\n",
    "    add_bias=False, method=\"l1\", lambda_=0.1\n",
    ") \n",
    "# This is faster than the in-dataframe ver because this uses NumPy data directly, which skips a copy.\n",
    "# This is faster than sklearn because the underlying linalg library is faster. The convergence criterion is also simpler, though \n",
    "# less rigourous, than sklearn's. However, you can set tol = 1e-7 and still be faster.\n",
    "lr.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "reg = Lasso(alpha = 0.1, fit_intercept=False, copy_X=False)\n",
    "reg.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Polars DS way\n",
    "print(\n",
    "    \"Polars DS: \",\n",
    "    df.select(\n",
    "        pds.lin_reg(\n",
    "            \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
    "            target = \"y\",\n",
    "            method = \"l2\",\n",
    "            l2_reg = 0.1\n",
    "        )\n",
    "    ).item(0, 0)\n",
    ")\n",
    "\n",
    "# Fit is done implicitly because X and y are passed at initialization\n",
    "# You can also don't put X and y here and do a lr.fit(X,y) later.\n",
    "lr = pds_linear.LR(\n",
    "    X=X, y=y, add_bias=False, method=\"l2\", lambda_ = 0.1,\n",
    ") \n",
    "print(\"PDS LR: \", lr.coeffs)\n",
    "\n",
    "# Sklearn\n",
    "reg = Ridge(alpha = 0.1, fit_intercept=False)\n",
    "reg.fit(X, y)\n",
    "print(\"Sklearn: \", reg.coef_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "df.select(\n",
    "    pds.lin_reg(\n",
    "        \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
    "        target = \"y\",\n",
    "        method = \"l2\",\n",
    "        l2_reg = 0.1\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "lr = pds_linear.LR(\n",
    "    add_bias=False, method=\"l2\", lambda_=0.1\n",
    ") \n",
    "lr.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%timeit\n",
    "reg = Ridge(alpha = 0.1, fit_intercept=False, copy_X=False)\n",
    "reg.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# What you can do with Polars DS but will be hard for Scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train a linear regression model on each category. And return the predictions\n",
    "df.select(\n",
    "    pl.col(\"id\"),\n",
    "    pds.lin_reg(\n",
    "        \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
    "        target = \"y\",\n",
    "        method = \"l2\",\n",
    "        l2_reg = 0.1,\n",
    "        return_pred = True\n",
    "    ).over(\"code\").alias(\"predictions\")\n",
    ").unnest(\"predictions\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train a linear regression model on each category. And return only the coefficients\n",
    "df.group_by(\"code\").agg(\n",
    "    pds.lin_reg(\n",
    "        \"x1\", \"x2\", \"x3\",\n",
    "        target = \"y\",\n",
    "        method = \"l2\",\n",
    "        l2_reg = 0.1,\n",
    "    )\n",
    ").sort(\"code\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
